{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys,os\n",
    "import csv\n",
    "import pickle\n",
    "import scipy\n",
    "import numpy as np\n",
    "from IPython.display import clear_output\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.neural_network import MLPClassifier\n",
    "from sklearn.ensemble import GradientBoostingClassifier\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "import sklearn.metrics\n",
    "import sklearn.model_selection\n",
    "import itertools\n",
    "import json\n",
    "from train_configurations import *"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Grid Search and Model Training\n",
    "\n",
    "This notebook is for training a specific model for each nation. Be sure to have run the nb __00_generate_bag_of_words.ipynb__ before running this.<br>\n",
    "The nb performs a grid search over a predefined set of parameters saved in __train_configuratons.py__ with K-fold cross validation.\n",
    "\n",
    "Select the desired nation, model and metaparameters in the next cell:\n",
    "\n",
    "- nation = the desired nation, possible values are {IT,FR,SP,GE,NE,AU, IT_speeches, IT_manual}\n",
    "- model_type = the type of classifier to use {RandomForest, GradientBoosting, NeuralNetwork}\n",
    "- target_score = select which score will be used to pick the best model in the grid search. Possible values are:\n",
    "    - AUC = Area Under ROC \n",
    "    - Accuracy = classification accuracy\n",
    "    - F1 = f1 score\n",
    "- n_splits = the number of folds for the K-fold cross validation\n",
    "- p_train = the fracion of data used for training and validation, must be in [0,1]\n",
    "- random_state = seed for the model initializations and the random number generators\n",
    "- n_jobs = number of cores to use during the grid-search\n",
    "\n",
    "\n",
    "The results will be saved in the \"model\" folder."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Set config variables here:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "nation=\"IT\"\n",
    "model_type = \"RandomForest\"\n",
    "target_score = \"AUC\"\n",
    "n_splits = 5\n",
    "p_train = 0.7\n",
    "random_state = 1\n",
    "n_jobs = 8"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Read Bag-of-words Data and Labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X = pickle.load(open(\"./bow_and_labels/X_{}_sentences.pkl\".format(nation), \"rb\"))\n",
    "Y = pickle.load(open(\"./bow_and_labels/Y_{}_sentences.pkl\".format(nation), \"rb\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Training"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"Splitting train+validation and test sets\")\n",
    "np.random.seed(random_state)\n",
    "indexes = np.random.permutation(range(X.shape[0]))\n",
    "n_train = int(p_train*X.shape[0])\n",
    "indexes_train = indexes[:n_train]\n",
    "indexes_test = indexes[n_train:]\n",
    "X_train, Y_train = X[indexes_train], Y[indexes_train]\n",
    "X_test, Y_test = X[indexes_test], Y[indexes_test]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"training {0} for {1} with {2} as target score\".format(nation,model_type,target_score))\n",
    "\n",
    "training_results = {\n",
    "    \"nation\": nation,\n",
    "    \"model_type\":model_type,\n",
    "    \"target_score\":target_score,\n",
    "    \"random_state\": random_state,\n",
    "    \"N_sentences\": nation_stats[nation][\"N_sentences\"],\n",
    "    \"frac_sentences\": nation_stats[nation][\"frac_sentences\"],\n",
    "}\n",
    "\n",
    "import time\n",
    "t_start = time.time()\n",
    "\n",
    "scoring = {'AUC': 'roc_auc', 'Accuracy': \"accuracy\", \"F1\":\"f1\"}\n",
    "\n",
    "\n",
    "cv = sklearn.model_selection.KFold(n_splits=n_splits, shuffle=True, random_state=random_state)\n",
    "\n",
    "if model_type == \"RandomForest\":\n",
    "    cw = None\n",
    "    if nation in [\"IT_manual\"]: cw = \"balanced_subsample\"\n",
    "    model = RandomForestClassifier(random_state=random_state,class_weight = cw)\n",
    "elif model_type == \"GradientBoosting\":\n",
    "    model = GradientBoostingClassifier(random_state=random_state)\n",
    "elif model_type == \"NeuralNetwork\":\n",
    "    model = MLPClassifier(random_state=1)\n",
    "elif model_type == \"Logistic\":\n",
    "    model = LogisticRegression(random_state=1,fit_intercept=False)\n",
    "else:\n",
    "    raise RuntimeError(\"Unspecified model. Select between RandomForest - GradientBoosting - NeuralNetwork - Logistic\")\n",
    "\n",
    "\n",
    "# define search\n",
    "search = sklearn.model_selection.GridSearchCV(model, param_space[model_type], scoring=scoring, cv=cv, refit=target_score,n_jobs=n_jobs, verbose=10)\n",
    "result = search.fit(X_train, Y_train)\n",
    "best_model = result.best_estimator_\n",
    "\n",
    "\n",
    "\n",
    "# report progress\n",
    "best_index = search.cv_results_[\"params\"].index(search.best_params_)\n",
    "n_splits = search.cv.n_splits\n",
    "for k in scoring:\n",
    "    avg_score = [search.cv_results_['split{0}_test_{1}'.format(split,k)][best_index] for split in range(n_splits)]\n",
    "    print(\"{0} Valid = {1} +/- {2}\".format(k, np.mean(avg_score), np.sqrt(np.var(avg_score)/len(avg_score))))\n",
    "    training_results[k] = np.mean(avg_score)\n",
    "    training_results[k+\"_err\"] = np.sqrt(np.var(avg_score)/len(avg_score))\n",
    "    \n",
    "    \n",
    "print(\"best parameters:\")\n",
    "print(search.best_params_)\n",
    "\n",
    "training_results[\"best_params\"] = search.best_params_"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Threshold\n",
    "\n",
    "Selecting probability threshold with Youdens method. This will increase classification accuracy.\n",
    "\n",
    "We do this by iterating over the $K$-fold splits and computing the Youdens metric for each. We also compute accuracy scores for train sets in each split.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_thresholds = []\n",
    "all_aurocs_train = []\n",
    "all_accuracies_train = []\n",
    "all_F1_train = []\n",
    "\n",
    "for train_index_batch, valid_index_batch in cv.split(X_train, Y_train):\n",
    "    X_batch = X_train[valid_index_batch]\n",
    "    Y_batch = Y_train[valid_index_batch]\n",
    "    \n",
    "    X_batch_train = X_train[train_index_batch]\n",
    "    Y_batch_train = Y_train[train_index_batch]\n",
    "    Y_batch_pred = best_model.predict_proba(X_batch_train)[:,1]\n",
    "    \n",
    "    ###################################\n",
    "    \n",
    "    fpr, tpr, thresholds = sklearn.metrics.roc_curve(Y_batch_train, Y_batch_pred,drop_intermediate=False)\n",
    "    tnr = 1 - fpr\n",
    "    fnr = 1 - tpr\n",
    "    youdens = tpr/(tpr+fnr) + tnr/(tnr+fpr) - 1 \n",
    "    max_threshold = thresholds[youdens.argmax()]\n",
    "    all_thresholds.append(max_threshold)\n",
    "    ###################################\n",
    "    Y_batch_class= Y_batch_pred>max_threshold\n",
    "    \n",
    "    auroc_train = sklearn.metrics.roc_auc_score(Y_batch_train, Y_batch_pred)\n",
    "    accuracy_train = sklearn.metrics.accuracy_score(Y_batch_train, Y_batch_class)\n",
    "    F1_train = sklearn.metrics.f1_score(Y_batch_train, Y_batch_class)\n",
    "    \n",
    "    all_aurocs_train.append(auroc_train)\n",
    "    all_accuracies_train.append(accuracy_train)\n",
    "    all_F1_train.append(F1_train)\n",
    "    \n",
    "\n",
    "    \n",
    "    \n",
    "print(\"Avg. AUC on train set = {0} +/- {1}\".format(np.mean(all_aurocs_train), np.sqrt(np.var(all_aurocs_train)/len(all_aurocs_train))))\n",
    "print(\"Avg. Accuracy on train set = {0} +/- {1}\".format(np.mean(all_accuracies_train), np.sqrt(np.var(all_accuracies_train)/len(all_accuracies_train))))\n",
    "print(\"Avg. F1 on train set = {0} +/- {1}\".format(np.mean(all_F1_train), np.sqrt(np.var(all_F1_train)/len(all_F1_train))))    \n",
    "    \n",
    "max_threshold = np.mean(all_thresholds)\n",
    "training_results[\"threshold\"] = max_threshold\n",
    "\n",
    "\n",
    "training_results[\"AUC_train\"] = np.mean(all_aurocs_train)\n",
    "training_results[\"F1_train\"] = np.mean(all_accuracies_train)\n",
    "training_results[\"Accuracy_train\"] = np.mean(all_F1_train)\n",
    "\n",
    "\n",
    "training_results[\"AUC_train_err\"] = np.sqrt(np.var(all_aurocs_train)/len(all_aurocs_train))\n",
    "training_results[\"F1_train_err\"] = np.sqrt(np.var(all_accuracies_train)/len(all_accuracies_train))\n",
    "training_results[\"Accuracy_train_err\"] = np.sqrt(np.var(all_F1_train)/len(all_F1_train))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Check on Validation Set"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# report progress\n",
    "best_index = search.cv_results_[\"params\"].index(search.best_params_)\n",
    "n_splits = search.cv.n_splits\n",
    "for k in scoring:\n",
    "    avg_score = [search.cv_results_['split{0}_test_{1}'.format(split,k)][best_index] for split in range(n_splits)]\n",
    "    print(\"{0} on validation set = {1} +/- {2}\".format(k, np.mean(avg_score), np.sqrt(np.var(avg_score)/len(avg_score))))\n",
    "    \n",
    "    training_results[\"{}_valid\".format(k)] = np.mean(avg_score)\n",
    "    training_results[\"{}_valid_err\".format(k)] = np.sqrt(np.var(avg_score)/len(avg_score))\n",
    "    "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Check on Test Set\n",
    "\n",
    "We check the goodness of the model on the test set, comparing it with a random classifier that predict $0$ or $1$ with a probability of $0.5$"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "Y_test_pred = best_model.predict_proba(X_test)[:,1]\n",
    "Y_test_classpred = best_model.predict(X_test)\n",
    "Y_test_class= Y_test_pred>max_threshold\n",
    "\n",
    "auroc_test = sklearn.metrics.roc_auc_score(Y_test, Y_test_pred)\n",
    "accuracy_test = sklearn.metrics.accuracy_score(Y_test, Y_test_class)\n",
    "F1_test = sklearn.metrics.f1_score(Y_test, Y_test_class)\n",
    "\n",
    "print(\"AUC on test set= \", auroc_test)\n",
    "print(\"Accuracy on test set = \", accuracy_test)\n",
    "print(\"F1 on test set = \", F1_test)\n",
    "training_results[\"AUC_test\"] = auroc_test\n",
    "training_results[\"F1_test\"] = F1_test\n",
    "training_results[\"Accuracy_test\"] = accuracy_test\n",
    "\n",
    "########################\n",
    "\n",
    "dummy_class = sklearn.dummy.DummyClassifier(strategy='uniform')\n",
    "dummy_class.fit(X_train, Y_train)\n",
    "Y_test_class = dummy_class.predict(X_test)\n",
    "\n",
    "accuracy_test = sklearn.metrics.accuracy_score(Y_test, Y_test_class)\n",
    "F1_test = sklearn.metrics.f1_score(Y_test, Y_test_class)\n",
    "\n",
    "print(\"Accuracy on test set (dummy classifier) = \", accuracy_test)\n",
    "print(\"F1 on test set (dummy classifier) = \", F1_test)\n",
    "\n",
    "training_results[\"F1_test_dummy\"] = F1_test\n",
    "training_results[\"Accuracy_test_dummy\"] = accuracy_test\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Saving results\n",
    "\n",
    "Model's best meta-parameters, threshold and parameters will be saved into the __models__ folder.<br>\n",
    "A recap of the training will be saved into the \"training_results.json\" file. Running the same configuration twice will overwrite the previous one."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "params = result.best_params_.copy()\n",
    "params[\"threshold\"] = max_threshold\n",
    "\n",
    "pickle.dump(params, open(\"./models/{0}_{1}_{2}_{3}_best_model_params.pkl\".format(nation, model_type,target_score,random_state), \"wb\"))\n",
    "pickle.dump(best_model, open(\"./models/{0}_{1}_{2}_{3}_best_model.pkl\".format(nation, model_type,target_score,random_state), \"wb\"))\n",
    "pickle.dump(indexes_test, open(\"./models/{0}_{1}_{2}_{3}_test_indexes.pkl\".format(nation, model_type,target_score,random_state), \"wb\"))\n",
    "pickle.dump(search, open(\"./models/{0}_{1}_{2}_{3}_search.pkl\".format(nation, model_type,target_score,random_state), \"wb\"))\n",
    "\n",
    "\n",
    "if not os.path.isfile(\"./training_results.json\"):\n",
    "    json.dump([training_results],open(\"./training_results.json\", \"w\"))\n",
    "else:\n",
    "    training_results_old = json.load(open(\"./training_results.json\", \"r\"))\n",
    "    \n",
    "    found_flag = False\n",
    "    for index, res in enumerate(training_results_old):\n",
    "        if res[\"nation\"] == training_results[\"nation\"] and \\\n",
    "            res[\"model_type\"] == training_results[\"model_type\"] and \\\n",
    "                res[\"target_score\"] == training_results[\"target_score\"] and \\\n",
    "                    res[\"random_state\"] == training_results[\"random_state\"]:\n",
    "                        found_flag = True\n",
    "                        break\n",
    "                \n",
    "    if found_flag:\n",
    "        print(\"same configuration found!\")\n",
    "        print(\"deleting old results and overwriting..\")\n",
    "        del training_results_old[index]\n",
    "        \n",
    "    print(\"saving..\")\n",
    "    training_results_old.append(training_results)\n",
    "    json.dump(training_results_old,open(\"./training_results.json\", \"w\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
